home *** CD-ROM | disk | FTP | other *** search
Text File | 1993-03-10 | 5.6 KB | 182 lines | [TEXT/KAHL] |
-
- //******************************************
- // Unique word count by Ronald M. Nepsund
- // please do not publish my address and phone number
- // Ronald M. Nepsund
- // 19229 Nordhoff St. #35
- // Northridge,Ca 91324
- // Work phone (818) 701-5051
- // Home phone (818) 885-0810
- //The original text is uppercased with all non alphabetic characters
- //turned to zero.
- //The list of unique words works as follows
- //The word is used to to index into a 1024 entry hash table.
- //The hash table contains pointers to the beginning of a linked
- //list of word entries. Each word entry holds the length of
- //the word and a pointer to the first occurance of the word
- //in the original text. The linked list is ordered first by the
- //length of the words and secondly alphabetically
-
- #define kHashMask 0x003FF
- #define kHashSize 0x00400
-
- typedef struct tNode{
- short length;
- Ptr wordPtr;
- struct tNode *nextNode;
- } WordTableRec,*WordTablePtr;
-
-
- //compare two strings
- short stringCmp(register short length,
- register char *pnt1, register char *pnt2)
- {
- while (--length > 0 && *pnt1==*pnt2) {
- pnt1++; pnt2++;
- }
- return *pnt1-*pnt2;
- }
-
- unsigned short CountUniqueWords(Ptr textPtr,unsigned short byteCount)
- {
- Ptr endPnt,wordStartPtr;
- register char *charPnt;
- register long hash;
- long count;
- register short wordLength;
- unsigned short totalWords = 0;
- short i,cmp;
- WordTablePtr *hashTable;
- WordTablePtr wordTable;
- register WordTablePtr linkPtr;
- WordTablePtr last,wordTableEntry;
- char charTypeTable[256];
- long *zeroTblPtr;
- //total possable number of words = 16556
- //assuming a maximum size buffer of text
- //I will use a hash table size of 1024
-
- count = 4L * kHashSize;
- hashTable = (WordTablePtr *)NewPtrClear(count); //array of (NULL)Ptr's
- if (hashTable == 0L)
- while (TRUE)
- SysBeep(10); //not enough memory
-
- count = 16556L * sizeof(WordTableRec);
- wordTable = (WordTablePtr)NewPtr(count);
-
- if (wordTable == 0L)
- while (TRUE)
- SysBeep(10); //not enough memory
- //zero out the 'charTypeTable'
- zeroTblPtr = (long *)&charTypeTable;
- for (i=0; i<64; i++)
- *zeroTblPtr++ = 0;
-
- //init lookup table used for uppercasing and identifying non letter characters
- for (i=0;i<26; i++)
- charTypeTable[i+(Byte)'A'] =
- charTypeTable[i+(Byte)'a'] = (char)(i+((Byte)'A'));
-
- totalWords = 0;
- charPnt = textPtr;
- endPnt = textPtr + byteCount;
- while (charPnt < endPnt) {
- //skip spaces
- while (charPnt < endPnt && charTypeTable[*charPnt] == 0)
- charPnt++;
- wordStartPtr = charPnt; //pointer to begining of new word
- hash = 0;
- if (endPnt-charPnt > 255) { //at least 255 characters left
- //advance to the end of the word
- //we can assume that a word is <=255 characters long
- //uppercasing the text
- //and doing a hash function
- while (*charPnt = charTypeTable[*charPnt])
- hash = (hash << 4) + *charPnt++ - 'A' + (hash >> 10);
-
- } else {
- while (charPnt < endPnt && (*charPnt = charTypeTable[*charPnt]))
- hash = (hash << 4) + *charPnt++ - 'A' + (hash >> 10);
- }
- wordLength = charPnt-wordStartPtr;
-
- hash &= kHashMask;
- //hash &= 0x3FFF;
- linkPtr = (WordTablePtr)hashTable[hash];
-
- last = NULL;
- if (linkPtr==NULL) { //this hash entry has not been used yet
- wordTableEntry = &wordTable[totalWords++];
- hashTable[hash] = wordTableEntry;
- wordTableEntry->length = wordLength;
- wordTableEntry->wordPtr = wordStartPtr;
- wordTableEntry->nextNode = 0;
- } else {
- //the entries are ordered by word length
- //find the first entry that is as long or longer
- while (linkPtr != NULL && linkPtr->length < wordLength) {
- last = linkPtr;
- linkPtr = linkPtr->nextNode;
- }
- if (linkPtr == NULL) { //no other entries as long as this one
- wordTableEntry = &wordTable[totalWords++];
- if (last == NULL)
- hashTable[hash] = wordTableEntry;
- else
- last->nextNode = wordTableEntry;
- wordTableEntry->length = wordLength;
- wordTableEntry->wordPtr= wordStartPtr;
- wordTableEntry->nextNode= NULL;
- } else
- if (linkPtr->length > wordLength){
- //insert betean 'last' and 'linkPtr'
- wordTableEntry = &wordTable[totalWords++];
- if (last == NULL) {
- wordTableEntry->nextNode = hashTable[hash];
- hashTable[hash] = wordTableEntry;
- } else {
- wordTableEntry->nextNode = last->nextNode;
- last->nextNode = wordTableEntry;
- }
- wordTableEntry->length = wordLength;
- wordTableEntry->wordPtr= wordStartPtr;
- } else { //check entries with same word lengths
- while (linkPtr != NULL && linkPtr->length == wordLength &&
- (cmp = stringCmp(wordLength,wordStartPtr,linkPtr->wordPtr)) <0) {
- last = linkPtr;
- linkPtr = linkPtr->nextNode;
- }
- if (linkPtr == NULL) { //end of list and no match
- wordTableEntry = &wordTable[totalWords++];
- if (last == NULL)
- hashTable[hash] = wordTableEntry;
- else
- last->nextNode = wordTableEntry;
- wordTableEntry->length = wordLength;
- wordTableEntry->wordPtr= wordStartPtr;
- wordTableEntry->nextNode= 0;
- } else
- if (linkPtr->length > wordLength || cmp>0) {
- //insert word entry here
- wordTableEntry = &wordTable[totalWords++];
- if (last == NULL) { //hash table to point here
- wordTableEntry->nextNode = hashTable[hash];
- hashTable[hash] = wordTableEntry;
- } else {
- wordTableEntry->nextNode= last->nextNode;
- last->nextNode = wordTableEntry;
- }
- wordTableEntry->length = wordLength;
- wordTableEntry->wordPtr = wordStartPtr;
- }
- }
- }
- }
-
- DisposePtr((Ptr)hashTable);
- DisposePtr((Ptr)wordTable);
- return totalWords;
- }
-
-